# Import libraries
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
figure(figsize=(5, 20), dpi=300)
from sklearn import preprocessing
from sklearn import manifold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from plotnine import ggplot, aes, geom_line, geom_abline, ggtitle, xlab, ylab
from sklearn.preprocessing import MinMaxScaler
import random
from sklearn import metrics
import plotly.express as px
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
<Figure size 1500x6000 with 0 Axes>
# Load data
# Feature matrix and target column live in separate CSV files; note the
# target file's extension is uppercase ".CSV" on disk.
features = pd.read_csv("Features.csv")
labels = pd.read_csv("Target.CSV")
Concatenating columnwise
# Join features and target side by side (axis=1) into a single dataframe.
data = pd.concat([features, labels], axis=1)
Replacing and Eliminating certain class value
# Harmonise label wording: the raw labels mix "Grade" and "Stage" prefixes.
data['BlcaGrade'] = data['BlcaGrade'].replace('Grade I', 'Stage I')
data['BlcaGrade'] = data['BlcaGrade'].replace('Grade II', 'Stage II')
data['BlcaGrade'] = data['BlcaGrade'].replace('Grade III', 'Stage III')
data['BlcaGrade'] = data['BlcaGrade'].replace('Grade IV', 'Stage IV')
# Replacing
# Fold Stage I into Stage II so the task becomes two-class (II vs III).
data['BlcaGrade'] = data['BlcaGrade'].replace('Stage I', 'Stage II')
# Eliminating
# Drop every Stage IV row entirely.
data = data.drop(data[(data['BlcaGrade'] == 'Stage IV')].index)
data['BlcaGrade'].value_counts()
# Display names for the two remaining classes (used for confusion-matrix plots).
label_names = ['Stage II', 'Stage III']
data['BlcaGrade'].value_counts()
Stage III 148 Stage II 138 Name: BlcaGrade, dtype: int64
# Encoding Labels
# 0: Stage II
# 1: Stage III
# label_encoder
# Encode the two string labels as integers. LabelEncoder assigns codes in
# sorted order, so 'Stage II' -> 0 and 'Stage III' -> 1.
label_encoder = preprocessing.LabelEncoder()
data['BlcaGrade']= label_encoder.fit_transform(data['BlcaGrade'])
data['BlcaGrade'].value_counts()
1 148 0 138 Name: BlcaGrade, dtype: int64
# Duplicate entry checking
# Sanity check: verify the concatenated dataframe contains no duplicated rows.
# Outputs the total number of rows in the dataframe.
print("Total entries: ", len(data))
# 'duplicates = df.duplicated()' uses the duplicated method to create a boolean series indicating whether each row is a duplicate or not.
duplicates = data.duplicated()
# 'duplicate_rows = df[duplicates]' uses the boolean series to index the dataframe and obtain a sub-dataframe containing only the duplicate rows.
duplicate_rows = data[duplicates]
# Outputs the number of duplicate rows in the sub-dataframe.
print("Duplicate entries: ", len(duplicate_rows))
Total entries: 286 Duplicate entries: 0
# NULL entry checking
print("NULL Entries: ", data.isnull().sum().sum())
NULL Entries: 0
# Visualise class separability by projecting the feature matrix to 2-D with t-SNE.
tsne_reducer = manifold.TSNE(n_components=2, random_state=42)
# Embed every sample (all columns except the label) into two components.
embedding = tsne_reducer.fit_transform(data.drop('BlcaGrade', axis=1))
# Collect the two components plus the class label into a plotting dataframe.
tsne_df = pd.DataFrame(
    np.column_stack((embedding, data['BlcaGrade'])),
    columns=['Component 1', 'Component 2', 'target'],
)
# column_stack upcast the label to float; restore it to int.
tsne_df.loc[:, 'target'] = tsne_df.target.astype(int)
# Interactive scatter of the embedding, coloured by class label.
tsne_fig = px.scatter(
    tsne_df,
    x='Component 1',
    y='Component 2',
    color=tsne_df.target.astype(str),
    labels={'color': 'Target Variable'},
    width=600,
    height=400,
    title="Component 1 VS Component 2 with respect to their labels",
)
tsne_fig.show()
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
# Split the dataframe into feature matrix X and target vector y.
X = data.drop('BlcaGrade',axis=1)
y = data['BlcaGrade']
# Mutual information of each feature with the class label (seeded for reproducibility).
mutual_info = mutual_info_classif(X, y, random_state=0)
mutual_info = pd.Series(mutual_info)
# Keep the 100 features with the highest mutual information.
# NOTE(review): SelectKBest re-runs mutual_info_classif internally without a
# random_state, so the selected set may vary between runs — confirm if exact
# reproducibility is required.
best_cols = SelectKBest(mutual_info_classif, k=100)
best_cols.fit(X, y)
print((X.columns[best_cols.get_support()]))
# Persist the selected feature names for later reference.
reducedFeatures = pd.DataFrame(X.columns[best_cols.get_support()])
reducedFeatures.to_csv("ReducedFeatures.csv")
# Plain-list copy of the selected names, used throughout the rest of the script.
selectedFeatures = list(X.columns[best_cols.get_support()])
Index(['CCDC124', 'ATP1A2', 'FAM120A', 'SLC12A3', 'FGF4', 'ITM2A', 'CD82',
'SF3B2', 'PATZ1', 'CYB5R3', 'TSPO', 'TRIB3', 'SPINT3', 'RGCC', 'TRADD',
'NUTF2', 'BFAR', 'TOX3', 'RAB2A', 'NEFM', 'C19orf53', 'EIF3B', 'BST1',
'RPS13', 'COX7A2', 'BCL6', 'ABCC5', 'CCT4', 'TCP1', 'TRIM24', 'PKN1',
'GLO1', 'ARHGEF6', 'BST2', 'KDM6B', 'RAB25', 'GRHL1', 'KRT85',
'TSPAN31', 'KCNMB4', 'DNAJC1', 'PHLDA1', 'ZNF687', 'ATP8B2', 'CRNN',
'S100A8', 'UBAP2L', 'SCUBE3', 'THY1', 'PRAC1', 'RECQL4', 'FLYWCH2',
'FGF19', 'DAPL1', 'UQCRQ', 'GEM', 'HNRNPK', 'PLEKHF1', 'HSP90B1', 'B2M',
'MS4A7', 'ARF4', 'RAC3', 'ZNF768', 'RNF213', 'SNHG29', 'SPINK6', 'MSC',
'MYLPF', 'SEPTIN9', 'SUMO3', 'IFIT1', 'COL4A1', 'SBSN', 'CGB5',
'S100A6', 'TUBA3C', 'GFPT1', 'RING1', 'MUC21', 'SDHAF1', 'RNU6-171P',
'MT-RNR1', 'IGHJ6', 'LCAT', 'AC016739.1', 'SNORA79B', 'IGKV6D-21',
'WDR46', 'OST4', 'IGHJ3P', 'IGKV3-11', 'IGKV2D-29', 'IGKV2OR22-4',
'RAB44', 'KRT7-AS', 'AC025580.1', 'GFY', 'H2BC3', 'IGHV1-69D'],
dtype='object')
# Mutual-information scores of all features, sorted descending (kept for inspection).
score = list(best_cols.scores_)
score.sort(reverse=True)
# Figure Size
fig = plt.figure(figsize=(10, 7))
# Bar plot of the 20 strongest selected features.
# BUG FIX: the original plotted globally-sorted scores against feature names
# taken in column order, so bar labels did not match bar heights. Pair each
# selected feature name with its own score first, then rank by score.
selected_names = best_cols.get_feature_names_out(X.columns)
selected_scores = best_cols.scores_[best_cols.get_support()]
ranked = sorted(zip(selected_names, selected_scores), key=lambda pair: pair[1], reverse=True)[:20]
plt.bar([name for name, _ in ranked], [s for _, s in ranked])
plt.xticks(rotation=45)
# Show Plot
plt.show()
# ---------------------------------------------------------------------------
# Manual 3-fold outer split for nested cross-validation.
# NOTE(review): the split is positional (no shuffling / stratification); it
# assumes row order carries no class ordering — confirm upstream.
# ---------------------------------------------------------------------------
X = data[selectedFeatures].values
y = data['BlcaGrade'].values
# Define the split size of outer cv here: one third of the samples per fold.
k = len(y) // 3
# Train 1, Train 2, Test 3 - Outer CV 1
outerFold_features_1 = X[:k]
outerFold_labels_1 = y[:k]
# Train 1, Test 2, Train 3 - Outer CV 2
outerFold_features_2 = X[k:2*k]
outerFold_labels_2 = y[k:2*k]
# Test 1, Train 2, Train 3 - Outer CV 3
# BUG FIX: the original sliced X[2*k:(3*k)+1], which only covers every row
# when len(y) % 3 == 1 (e.g. 286) and silently drops the final sample when
# len(y) % 3 == 2. Slicing to the end assigns every remaining row to fold 3.
outerFold_features_3 = X[2*k:]
outerFold_labels_3 = y[2*k:]
# Training Features and Labels for 1st Outer CV (folds 1 + 2; fold 3 held out)
features_1 = np.concatenate([outerFold_features_1, outerFold_features_2])
label_1 = np.concatenate([outerFold_labels_1, outerFold_labels_2])
# Training Features and Labels for 2nd Outer CV (folds 1 + 3; fold 2 held out)
features_2 = np.concatenate([outerFold_features_1, outerFold_features_3])
label_2 = np.concatenate([outerFold_labels_1, outerFold_labels_3])
# Training Features and Labels for 3rd Outer CV (folds 2 + 3; fold 1 held out)
features_3 = np.concatenate([outerFold_features_2, outerFold_features_3])
label_3 = np.concatenate([outerFold_labels_2, outerFold_labels_3])
# ---------------------------------------------------------------------------------------------------------------
# Progress reporter for the inner cross-validation loop
# ---------------------------------------------------------------------------------------------------------------
def disp(count, feature, p1, p2, trainResult, testResult, selected, clf_arguments1, clf_arguments2):
    '''Print one formatted progress line for the current inner-CV iteration.

    args: (9 arguments)
        count - Iteration count
        feature - The best feature index chosen in this iteration
        p1 - Chosen value of the first classifier parameter
        p2 - Chosen value of the second classifier parameter
        trainResult - Mean training accuracy for this inner-CV iteration
        testResult - Mean test accuracy for this inner-CV iteration
        selected - Running list of selected features (the global 'featuresOuterFold')
        clf_arguments1 - Name of the first classifier parameter
        clf_arguments2 - Name of the second classifier parameter
    Returns:
        No return value
    '''
    summary = (
        f"Iteration {count} >> Feature: {feature}; "
        f"{clf_arguments1}: {p1}; {clf_arguments2}: {p2}; "
        f"Train Accuracy: {round(trainResult, 4)}; "
        f"Test Accuracy: {round(testResult, 4)}; "
        f"Selected Features: {selected}"
    )
    print(summary)
    print("---------------------------------------------------------------------------------------------------------------------------------")
# ----------------------------------------------------------------
# Inner cross-validation step of the forward feature selection
# ----------------------------------------------------------------
def innerCV(count, features, param1, param2, X, y, cv, clf, clf_arguments1, clf_arguments2):
    '''Grid-search one forward-feature-selection step with inner cross-validation.

    Every combination of a candidate feature index and the two hyper-parameter
    value lists is scored with k-fold cross-validation; the combination with
    the highest (ties: most recent) mean test accuracy wins the iteration.

    args:
        count - Iteration count
        features - Candidate feature indices (1-based) still available
        param1 - Candidate values for the first classifier parameter
        param2 - Candidate values for the second classifier parameter
        X - Training features of the current outer fold
        y - Training labels of the current outer fold
        cv - Number of folds for the inner cross-validation
        clf - Classifier class
        clf_arguments1 - Name of the first classifier parameter
        clf_arguments2 - Name of the second classifier parameter
    Returns:
        feature - The winning (best) 1-based feature index
    '''
    best_score = 0.0
    for feat_idx in features:
        for val1 in param1:
            for val2 in param2:
                # Hyper-parameters for this grid point, keyed by their real names.
                clf_kwargs = {clf_arguments1: val1, clf_arguments2: val2}
                # Score this single candidate feature column (indices are
                # 1-based, hence the -1) with k-fold cross-validation.
                fold_scores = cross_validate(clf(**clf_kwargs), X[:, feat_idx - 1].reshape(-1, 1), y, cv=cv, return_train_score=True)
                mean_test = fold_scores['test_score'].mean()
                # '<=' (not '<') keeps the most recent grid point on ties.
                if best_score <= float(mean_test):
                    best_score = float(mean_test)
                    trainResult = fold_scores['train_score'].mean()
                    testResult = mean_test
                    feature = feat_idx
                    p1 = val1
                    p2 = val2
    # Record the winner in the global FFS list and report this iteration.
    featuresOuterFold.append(feature)
    disp(count, feature, p1, p2, trainResult, testResult, featuresOuterFold, clf_arguments1, clf_arguments2)
    return feature
# ------------------------------------------------------------------------------
# Outer loop of the nested cross-validation / forward feature selection
# ------------------------------------------------------------------------------
def outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, X, y):
    '''Run five forward-feature-selection iterations on one outer fold.

    Each iteration draws the next hyper-parameter value lists from the BACK of
    params1/params2, removes the feature selected by the previous iteration
    from the candidate pool, and delegates the grid search to innerCV with
    5-fold inner cross-validation.

    args:
        clf - Classifier class
        clf_arguments1 - Name of the first classifier parameter
        clf_arguments2 - Name of the second classifier parameter
        params1 - Five candidate-value lists for the first parameter (consumed back to front)
        params2 - Five candidate-value lists for the second parameter (consumed back to front)
        X - Training features of the current outer fold
        y - Training labels of the current outer fold
    Returns:
        No return value (selected features accumulate in the global featuresOuterFold)
    '''
    # Candidate feature indices are 1-based: 1 .. len(selectedFeatures).
    features = [i + 1 for i in range(len(selectedFeatures))]
    # BUG FIX: the original popped directly from the caller's lists, emptying
    # them as a side effect; work on shallow copies so the caller's parameter
    # grids survive the call. This also removes the duplicated first-iteration
    # special case of the original.
    grid1 = list(params1)
    grid2 = list(params2)
    feature = None
    # Five FFS iterations — one feature is selected per iteration.
    for i in range(1, 6):
        if feature is not None:
            # Remove the previously selected feature from the candidate pool.
            features.remove(feature)
        # Each iteration consumes the next value lists from the back of the grids.
        param1 = grid1.pop()
        param2 = grid2.pop()
        feature = innerCV(i, features, param1, param2, X, y, 5, clf, clf_arguments1, clf_arguments2)
# -------------------------------------------------------------------------------------------------------
# This function is used to evaluate the performance of a classifier on the outer fold of cross-validation
# -------------------------------------------------------------------------------------------------------
def evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, selectedF, X, y, outerFold_features, outerFold_labels, label_names):
    '''Train the classifier on the outer-fold training data and evaluate on the held-out outer fold.

    args:
        clf_name: name of the classifier ('SVM' additionally enables probability=True)
        clf: the classifier class
        clf_arguments1: name of the first classifier parameter
        clf_arguments2: name of the second classifier parameter
        param1: value for the first classifier parameter
        param2: value for the second classifier parameter
        selectedF: 1-based index list of the top-5 features from nested FFS
        X: training feature matrix of the outer fold
        y: training labels of the outer fold
        outerFold_features: unseen feature data of the held-out outer fold
        outerFold_labels: unseen labels of the held-out outer fold
        label_names: the two class names shown in the confusion-matrix plot
    Returns:
        No return value (prints metrics, draws plots, and appends one entry each
        to the module-level lists param_1, param_2, trainScore, testScore,
        falsePositiveRate, truePositiveRate, aucScore and featureSubset).
    '''
    # Mapping features from list to array — array indices are 0-based while the
    # FFS feature list is 1-based.
    # BUG FIX: the original read the global featuresOuterFold here, silently
    # ignoring the selectedF parameter; use the parameter that was passed in.
    selected_idx = [x - 1 for x in selectedF]
    sep = "--------------------------------------------------------------------------------------------------"
    # -----------------------------------------------------------------------------------------
    # Training classifier with the parameters and features with best accuracy found in inner CV
    # -----------------------------------------------------------------------------------------
    # SVC needs probability=True so predict_proba is available for the ROC curve below.
    args = {clf_arguments1: param1, clf_arguments2: param2}
    if clf_name == 'SVM':
        classifier = clf(**args, probability=True)
    else:
        classifier = clf(**args)
    classifier.fit(X[:, selected_idx], y)
    print(sep)
    print("Training Score on outer fold:", round(classifier.score(X[:, selected_idx], y), 6) * 100)
    print(sep)
    # --------------------------------------------------------------------------------------------------
    # Testing on Outer CV
    # --------------------------------------------------------------------------------------------------
    y_predict = classifier.predict(outerFold_features[:, selected_idx])
    print("Test Accuracy on outer fold:", round(accuracy_score(outerFold_labels, y_predict), 6) * 100)
    print(sep)
    print(clf_arguments1 + ": " + str(param1))
    print(clf_arguments2 + ": " + str(param2))
    print("Features: ", selectedF)
    print(sep)
    # --------------------------------------------------------------------------------------------------
    # Classification Report
    # --------------------------------------------------------------------------------------------------
    print(classification_report(outerFold_labels, y_predict, digits=6))
    print(sep)
    # --------------------------------------------------------------------------------------------------
    # ROC AUC Curve and Score
    # --------------------------------------------------------------------------------------------------
    # No-skill baseline: predict the negative class for every sample.
    ns_probs = [0 for _ in range(len(outerFold_labels))]
    # Positive-class probabilities from the trained classifier.
    lr_probs = classifier.predict_proba(outerFold_features[:, selected_idx])
    lr_probs = lr_probs[:, 1]
    # ROC AUC for the no-skill baseline (always 0.5) and for the classifier.
    ns_auc = roc_auc_score(outerFold_labels, ns_probs)
    lr_auc = roc_auc_score(outerFold_labels, lr_probs)
    print('No Skill: ROC AUC=%.3f' % (ns_auc))
    print(clf_name + ': ROC AUC=%.3f' % (lr_auc))
    print(sep)
    # ROC curves for the no-skill baseline and the classifier.
    ns_fpr, ns_tpr, _ = roc_curve(outerFold_labels, ns_probs)
    lr_fpr, lr_tpr, _ = roc_curve(outerFold_labels, lr_probs)
    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    plt.plot(lr_fpr, lr_tpr, marker='.', label=clf_name)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.title(clf_name)
    plt.show()
    print(sep)
    print("Confusion Matrix")
    print(sep)
    ax = plt.subplot()
    plt.title(clf_name)
    metrics.ConfusionMatrixDisplay(
        confusion_matrix=metrics.confusion_matrix(outerFold_labels, y_predict), display_labels=[label_names[0], label_names[1]]).plot(ax=ax, cmap=plt.cm.Greens)
    # ------------------------------------------------------------------------------------------------------------
    # These lines append various metrics and information to global variables to be used later for further analysis
    # ------------------------------------------------------------------------------------------------------------
    param_1.append(param1)
    param_2.append(param2)
    trainScore.append(round(classifier.score(X[:, selected_idx], y), 6) * 100)
    testScore.append(round(accuracy_score(outerFold_labels, y_predict), 6) * 100)
    falsePositiveRate.append(lr_fpr)
    truePositiveRate.append(lr_tpr)
    aucScore.append(lr_auc)
    featureSubset.append(selectedF)
def combinedROCPlot(clf_name, featureSubset, param_1, param_2, trainScore, testScore, aucScore, falsePositiveRate, truePositiveRate, param_1_name='n_estimator', param_2_name='max_depth'):
    '''Print per-fold results and plot the combined ROC curves of all outer folds.

    args:
        clf_name: the name of the classifier being used
        featureSubset: selected feature subset per outer fold
        param_1: first-hyper-parameter value per outer fold
        param_2: second-hyper-parameter value per outer fold
        trainScore: training score per outer fold
        testScore: test score per outer fold
        aucScore: AUC score per outer fold
        falsePositiveRate: ROC false-positive rates per outer fold
        truePositiveRate: ROC true-positive rates per outer fold
        param_1_name: display name of the first hyper-parameter (default keeps
            the original Random-Forest wording, so existing calls are unchanged)
        param_2_name: display name of the second hyper-parameter
    Returns:
        No return value
    '''
    # Diagonal reference line for a no-skill classifier.
    plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
    # GENERALISATION: iterate over however many outer folds were recorded, and
    # label the hyper-parameters by name (the original hard-coded 3 folds and
    # the Random-Forest parameter names even when called for the SVM).
    for i in range(len(featureSubset)):
        print("--------------------------------------------")
        print("Outer Fold " + str(i + 1) + " Result")
        print("--------------------------------------------")
        print("Feature Subset: ", featureSubset[i])
        print("Best " + param_1_name + ": ", param_1[i])
        print("Best " + param_2_name + ": ", param_2[i])
        print("Train Score: ", trainScore[i])
        print("Test Score: ", testScore[i])
        print("AUC Score: ", aucScore[i])
    # One ROC curve per outer fold on the same axes.
    for i in range(len(falsePositiveRate)):
        plt.plot(falsePositiveRate[i], truePositiveRate[i], marker='.', label=clf_name + ' - Outer Fold ' + str(i + 1))
    # Add axis labels, legend, and display the graph
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()
# Global variables
# These lines of code are initializing several empty lists
# which will be used later in the program to store data or results
# Resetting global variable data for other classifiers
# (evaluationOuterFold appends one entry per outer fold to each list, and
# combinedROCPlot reads them back at the end.)
param_1 = []
param_2 = []
trainScore = []
testScore = []
falsePositiveRate = []
truePositiveRate = []
aucScore = []
featureSubset = []
# Classifier Name
clf_name = 'Random Forest'
# Classifier's class name
clf = RandomForestClassifier
# Argument 1 name for classifier
clf_arguments1 = 'n_estimators'
# Argument 2 name for classifier
clf_arguments2 = 'max_depth'
# Values of argument 1 parameter list for classifier
# (consumed from the back by outerFold: FFS iteration 1 uses [65, 150, 200],
# iteration 5 uses [5, 10, 15])
params1 = [[5, 10, 15], [20, 25, 30], [35, 40, 45], [50, 55, 60], [65, 150, 200]]
# max_depth parameter list for Random Forest
params2 = [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6], [5, 6, 7]]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
# Outer CV 1: train on folds 1+2 (features_1/label_1); fold 3 is held out.
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_1, label_1)
Iteration 1 >> Feature: 67; n_estimators: 150; max_depth: 6; Train Accuracy: 0.9066; Test Accuracy: 0.6842; Selected Features: [67] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 95; n_estimators: 60; max_depth: 6; Train Accuracy: 0.8276; Test Accuracy: 0.6579; Selected Features: [67, 95] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 81; n_estimators: 35; max_depth: 3; Train Accuracy: 0.6895; Test Accuracy: 0.6632; Selected Features: [67, 95, 81] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 97; n_estimators: 30; max_depth: 4; Train Accuracy: 0.7829; Test Accuracy: 0.6632; Selected Features: [67, 95, 81, 97] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 6; n_estimators: 10; max_depth: 2; Train Accuracy: 0.6618; Test Accuracy: 0.6368; Selected Features: [67, 95, 81, 97, 6] ---------------------------------------------------------------------------------------------------------------------------------
# Values of argument 1 parameter list for classifier
# Final hyper-parameter values chosen (manually) from the FFS run above.
param1 = 10
# Values of argument 2 parameter list for classifier
param2 = 2
# Evaluate on held-out fold 3 using the FFS-selected features.
evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_1, label_1, outerFold_features_3, outerFold_labels_3, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 69.47370000000001
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 60.4167
--------------------------------------------------------------------------------------------------
n_estimators: 10
max_depth: 2
Features: [85, 44, 97, 52, 38]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.595238 0.543478 0.568182 46
1 0.611111 0.660000 0.634615 50
accuracy 0.604167 96
macro avg 0.603175 0.601739 0.601399 96
weighted avg 0.603505 0.604167 0.602783 96
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Random Forest: ROC AUC=0.610
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Values of argument 1 parameter list for classifier
# (fresh n_estimators grids for outer CV 2; consumed from the back by outerFold)
params1 = [[5, 10, 15], [40, 25, 30], [35, 20, 45], [50, 55, 60], [90, 100, 180]]
# max_depth parameter list for Random Forest
params2 = [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6], [5, 6, 7]]
# Empty list for FFS - Global Variable
# Reset so outer CV 2 starts its own forward feature selection from scratch.
featuresOuterFold = []
# Invoking FFS
# Outer CV 2: train on folds 1+3 (features_2/label_2); fold 2 is held out.
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_2, label_2)
Iteration 1 >> Feature: 65; n_estimators: 90; max_depth: 6; Train Accuracy: 0.8757; Test Accuracy: 0.665; Selected Features: [65] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 50; n_estimators: 55; max_depth: 6; Train Accuracy: 0.8888; Test Accuracy: 0.6698; Selected Features: [65, 50] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 80; n_estimators: 35; max_depth: 5; Train Accuracy: 0.7683; Test Accuracy: 0.6389; Selected Features: [65, 50, 80] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 73; n_estimators: 25; max_depth: 4; Train Accuracy: 0.7409; Test Accuracy: 0.6335; Selected Features: [65, 50, 80, 73] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 19; n_estimators: 10; max_depth: 3; Train Accuracy: 0.7081; Test Accuracy: 0.6281; Selected Features: [65, 50, 80, 73, 19] ---------------------------------------------------------------------------------------------------------------------------------
# Values of argument 1 parameter list for classifier
# Final hyper-parameter values chosen from the FFS run above.
param1 = 10
# Values of argument 2 parameter list for classifier
param2 = 3
# Evaluate on held-out fold 2 using the FFS-selected features.
evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_2, label_2, outerFold_features_2, outerFold_labels_2, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 74.8691
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 55.789500000000004
--------------------------------------------------------------------------------------------------
n_estimators: 10
max_depth: 3
Features: [85, 44, 97, 52, 38]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.600000 0.375000 0.461538 48
1 0.538462 0.744681 0.625000 47
accuracy 0.557895 95
macro avg 0.569231 0.559840 0.543269 95
weighted avg 0.569555 0.557895 0.542409 95
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Random Forest: ROC AUC=0.524
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Values of argument 1 parameter list for classifier
# (n_estimators grids for outer CV 3; consumed from the back by outerFold)
params1 = [[3, 5, 2], [15, 25, 30], [35, 60, 45], [40, 55, 60], [200,90, 100]]
# max_depth parameter list for Random Forest
params2 = [[1, 2, 3], [2, 5, 4], [3, 4, 5], [4, 5, 6], [4, 8, 7]]
# Empty list for FFS - Global Variable
# Reset so outer CV 3 starts its own forward feature selection from scratch.
featuresOuterFold = []
# Invoking FFS
# Outer CV 3: train on folds 2+3 (features_3/label_3); fold 1 is held out.
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_3, label_3)
Iteration 1 >> Feature: 85; n_estimators: 100; max_depth: 8; Train Accuracy: 0.8285; Test Accuracy: 0.6543; Selected Features: [85] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 44; n_estimators: 60; max_depth: 6; Train Accuracy: 0.877; Test Accuracy: 0.6439; Selected Features: [85, 44] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 97; n_estimators: 45; max_depth: 5; Train Accuracy: 0.8206; Test Accuracy: 0.6386; Selected Features: [85, 44, 97] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 52; n_estimators: 25; max_depth: 5; Train Accuracy: 0.8194; Test Accuracy: 0.6279; Selected Features: [85, 44, 97, 52] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 38; n_estimators: 2; max_depth: 3; Train Accuracy: 0.6454; Test Accuracy: 0.613; Selected Features: [85, 44, 97, 52, 38] ---------------------------------------------------------------------------------------------------------------------------------
# Values of argument 1 parameter list for classifier
# Final hyper-parameter values chosen from the FFS run above.
param1 = 2
# Values of argument 2 parameter list for classifier
param2 = 3
# Evaluate on held-out fold 1 using the FFS-selected features.
evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_3, label_3, outerFold_features_1, outerFold_labels_1, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 62.30369999999999
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 52.6316
--------------------------------------------------------------------------------------------------
n_estimators: 2
max_depth: 3
Features: [85, 44, 97, 52, 38]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.492063 0.704545 0.579439 44
1 0.593750 0.372549 0.457831 51
accuracy 0.526316 95
macro avg 0.542907 0.538547 0.518635 95
weighted avg 0.546653 0.526316 0.514155 95
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Random Forest: ROC AUC=0.600
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Plot the combined ROC curves and per-fold summary accumulated in the globals
combinedROCPlot(
    clf_name, featureSubset, param_1, param_2,
    trainScore, testScore, aucScore,
    falsePositiveRate, truePositiveRate,
)
-------------------------------------------- Outer Fold 1 Result -------------------------------------------- Feature Subset: [67, 95, 81, 97, 6] Best n_estimator: 5 Best max_depth: 5 Train Score: 80.52629999999999 Test Score: 45.8333 AUC Score: 0.485 -------------------------------------------- Outer Fold 2 Result -------------------------------------------- Feature Subset: [65, 50, 80, 73, 19] Best n_estimator: 5 Best max_depth: 4 Train Score: 80.6283 Test Score: 50.5263 AUC Score: 0.5088652482269503 -------------------------------------------- Outer Fold 3 Result -------------------------------------------- Feature Subset: [85, 44, 97, 52, 38] Best n_estimator: 3 Best max_depth: 2 Train Score: 66.4921 Test Score: 47.3684 AUC Score: 0.4044117647058823
# Translate column indices into column names and pair-plot them against the class label
listB = [32, 45, 53, 21, 55]
res = [selectedFeatures[i] for i in listB]
res.append('BlcaGrade')
sns.pairplot(data[res], hue='BlcaGrade', palette='tab10')
<seaborn.axisgrid.PairGrid at 0x7f5ea9050f70>
# ------------------------------------------------------------------
# Reset the module-level accumulators before running the next
# classifier; combinedROCPlot() reads these lists later on.
# ------------------------------------------------------------------
param_1, param_2 = [], []
trainScore, testScore = [], []
falsePositiveRate, truePositiveRate = [], []
aucScore, featureSubset = [], []

# Classifier configuration: support-vector machine
clf_name = 'SVM'            # display name used in printouts/plots
clf = SVC                   # classifier class (instantiated inside the folds)
clf_arguments1 = 'C'        # first hyper-parameter name
clf_arguments2 = 'kernel'   # second hyper-parameter name

# Candidate hyper-parameter values, one pair of lists per FFS iteration
params1 = [[0.01, 0.1], [0.001, 1], [0.002, 1], [1, 0.003], [0.1, 1]]
params2 = [['rbf'], ['rbf'], ['rbf'], ['rbf'], ['rbf']]

# Global list that forward feature selection fills in; starts empty
featuresOuterFold = []

# Run forward feature selection on outer fold 1
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_1, label_1)
Iteration 1 >> Feature: 37; C: 1; kernel: rbf; Train Accuracy: 0.6211; Test Accuracy: 0.6263; Selected Features: [37] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 97; C: 1; kernel: rbf; Train Accuracy: 0.6303; Test Accuracy: 0.6; Selected Features: [37, 97] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 81; C: 1; kernel: rbf; Train Accuracy: 0.6224; Test Accuracy: 0.6; Selected Features: [37, 97, 81] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 6; C: 1; kernel: rbf; Train Accuracy: 0.5961; Test Accuracy: 0.5895; Selected Features: [37, 97, 81, 6] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 10; C: 0.1; kernel: rbf; Train Accuracy: 0.5711; Test Accuracy: 0.5632; Selected Features: [37, 97, 81, 6, 10] ---------------------------------------------------------------------------------------------------------------------------------
# Best hyper-parameter values picked from the FFS run above
param1, param2 = 1, 'rbf'  # C, kernel
# Score the chosen feature subset / hyper-parameters on the held-out outer fold
evaluationOuterFold(
    clf_name, clf, clf_arguments1, clf_arguments2,
    param1, param2, featuresOuterFold,
    features_1, label_1,
    outerFold_features_3, outerFold_labels_3,
    label_names,
)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 62.1053
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 55.2083
--------------------------------------------------------------------------------------------------
C: 1
kernel: rbf
Features: [37, 97, 81, 6, 10]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.545455 0.391304 0.455696 46
1 0.555556 0.700000 0.619469 50
accuracy 0.552083 96
macro avg 0.550505 0.545652 0.537583 96
weighted avg 0.550715 0.552083 0.540995 96
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
SVM: ROC AUC=0.473
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Candidate C values per FFS iteration for outer fold 2
params1 = [[0.01, 0.02], [0.03, 0.1], [0.02, 0.0001], [0.001, 0.002], [0.3, 0.003]]
# Kernel is fixed to RBF in every iteration
params2 = [['rbf'], ['rbf'], ['rbf'], ['rbf'], ['rbf']]
# Reset the FFS state before the run
featuresOuterFold = []
# Run forward feature selection on outer fold 2
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_2, label_2)
Iteration 1 >> Feature: 48; C: 0.3; kernel: rbf; Train Accuracy: 0.644; Test Accuracy: 0.6177; Selected Features: [48] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 100; C: 0.002; kernel: rbf; Train Accuracy: 0.5288; Test Accuracy: 0.5287; Selected Features: [48, 100] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 99; C: 0.0001; kernel: rbf; Train Accuracy: 0.5288; Test Accuracy: 0.5287; Selected Features: [48, 100, 99] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 27; C: 0.1; kernel: rbf; Train Accuracy: 0.5955; Test Accuracy: 0.5552; Selected Features: [48, 100, 99, 27] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 98; C: 0.02; kernel: rbf; Train Accuracy: 0.5288; Test Accuracy: 0.5287; Selected Features: [48, 100, 99, 27, 98] ---------------------------------------------------------------------------------------------------------------------------------
# Best hyper-parameter values picked from the FFS run above
param1, param2 = 0.3, 'rbf'  # C, kernel
# Score the chosen feature subset / hyper-parameters on the held-out outer fold
evaluationOuterFold(
    clf_name, clf, clf_arguments1, clf_arguments2,
    param1, param2, featuresOuterFold,
    features_2, label_2,
    outerFold_features_2, outerFold_labels_2,
    label_names,
)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 52.8796
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 49.4737
--------------------------------------------------------------------------------------------------
C: 0.3
kernel: rbf
Features: [20, 40, 100, 67, 99]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.000000 0.000000 0.000000 48
1 0.494737 1.000000 0.661972 47
accuracy 0.494737 95
macro avg 0.247368 0.500000 0.330986 95
weighted avg 0.244765 0.494737 0.327502 95
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
SVM: ROC AUC=0.520
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Candidate C values per FFS iteration for outer fold 3
params1 = [[0.01, 0.002], [0.03, 0.1], [0.02, 0.01], [0.001, 0.3], [0.3, 0.002]]
# Kernel is fixed to RBF in every iteration
params2 = [['rbf'], ['rbf'], ['rbf'], ['rbf'], ['rbf']]
# Reset the FFS state before the run
featuresOuterFold = []
# Run forward feature selection on outer fold 3
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_3, label_3)
Iteration 1 >> Feature: 20; C: 0.3; kernel: rbf; Train Accuracy: 0.6152; Test Accuracy: 0.5915; Selected Features: [20] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 40; C: 0.3; kernel: rbf; Train Accuracy: 0.5955; Test Accuracy: 0.581; Selected Features: [20, 40] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 100; C: 0.01; kernel: rbf; Train Accuracy: 0.5079; Test Accuracy: 0.5078; Selected Features: [20, 40, 100] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 67; C: 0.1; kernel: rbf; Train Accuracy: 0.6087; Test Accuracy: 0.5552; Selected Features: [20, 40, 100, 67] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 99; C: 0.002; kernel: rbf; Train Accuracy: 0.5079; Test Accuracy: 0.5078; Selected Features: [20, 40, 100, 67, 99] ---------------------------------------------------------------------------------------------------------------------------------
# Best hyper-parameter values picked from the FFS run above
param1, param2 = 0.3, 'rbf'  # C, kernel
# Score the chosen feature subset / hyper-parameters on the held-out outer fold
evaluationOuterFold(
    clf_name, clf, clf_arguments1, clf_arguments2,
    param1, param2, featuresOuterFold,
    features_3, label_3,
    outerFold_features_1, outerFold_labels_1,
    label_names,
)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 51.3089
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 50.5263
--------------------------------------------------------------------------------------------------
C: 0.3
kernel: rbf
Features: [20, 40, 100, 67, 99]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.363636 0.090909 0.145455 44
1 0.523810 0.862745 0.651852 51
accuracy 0.505263 95
macro avg 0.443723 0.476827 0.398653 95
weighted avg 0.449624 0.505263 0.417310 95
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
SVM: ROC AUC=0.645
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Plot the combined ROC curves and per-fold summary accumulated in the globals
combinedROCPlot(
    clf_name, featureSubset, param_1, param_2,
    trainScore, testScore, aucScore,
    falsePositiveRate, truePositiveRate,
)
-------------------------------------------- Outer Fold 1 Result -------------------------------------------- Feature Subset: [37, 97, 81, 6, 10] Best n_estimator: 1 Best max_depth: rbf Train Score: 62.1053 Test Score: 55.2083 AUC Score: 0.47304347826086957 -------------------------------------------- Outer Fold 2 Result -------------------------------------------- Feature Subset: [48, 100, 99, 27, 98] Best n_estimator: 0.3 Best max_depth: rbf Train Score: 57.0681 Test Score: 49.4737 AUC Score: 0.5035460992907801 -------------------------------------------- Outer Fold 3 Result -------------------------------------------- Feature Subset: [20, 40, 100, 67, 99] Best n_estimator: 0.3 Best max_depth: rbf Train Score: 51.3089 Test Score: 50.5263 AUC Score: 0.645276292335116
# Translate column indices into column names and pair-plot them against the class label
listB = [78, 68, 48, 59, 61]
res = [selectedFeatures[i] for i in listB]
res.append('BlcaGrade')
sns.pairplot(data[res], hue='BlcaGrade', palette='tab10')
# ------------------------------------------------------------------
# Reset the module-level accumulators before running the next
# classifier; combinedROCPlot() reads these lists later on.
# ------------------------------------------------------------------
param_1, param_2 = [], []
trainScore, testScore = [], []
falsePositiveRate, truePositiveRate = [], []
aucScore, featureSubset = [], []

# Classifier configuration: gradient boosting
clf_name = 'Gradient Boosting'     # display name used in printouts/plots
clf = GradientBoostingClassifier   # classifier class (instantiated inside the folds)
clf_arguments1 = 'n_estimators'    # first hyper-parameter name
clf_arguments2 = 'max_depth'       # second hyper-parameter name

# Candidate hyper-parameter values, one pair of lists per FFS iteration
params1 = [[2, 3], [5, 25], [30, 35], [50, 60], [70, 100]]
params2 = [[1, 2], [2, 3], [3, 4], [4, 7], [5, 6]]

# Global list that forward feature selection fills in; starts empty
featuresOuterFold = []

# Run forward feature selection on outer fold 1
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_1, label_1)
Iteration 1 >> Feature: 66; n_estimators: 100; max_depth: 6; Train Accuracy: 1.0; Test Accuracy: 0.6368; Selected Features: [66] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 93; n_estimators: 50; max_depth: 4; Train Accuracy: 0.8921; Test Accuracy: 0.6368; Selected Features: [66, 93] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 34; n_estimators: 35; max_depth: 4; Train Accuracy: 0.9342; Test Accuracy: 0.6263; Selected Features: [66, 93, 34] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 81; n_estimators: 5; max_depth: 2; Train Accuracy: 0.6829; Test Accuracy: 0.6368; Selected Features: [66, 93, 34, 81] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 22; n_estimators: 3; max_depth: 2; Train Accuracy: 0.6592; Test Accuracy: 0.6211; Selected Features: [66, 93, 34, 81, 22] ---------------------------------------------------------------------------------------------------------------------------------
# Best hyper-parameter values picked from the FFS run above
param1, param2 = 5, 2  # n_estimators, max_depth
# Score the chosen feature subset / hyper-parameters on the held-out outer fold
evaluationOuterFold(
    clf_name, clf, clf_arguments1, clf_arguments2,
    param1, param2, featuresOuterFold,
    features_1, label_1,
    outerFold_features_3, outerFold_labels_3,
    label_names,
)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 70.52629999999999
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 44.7917
--------------------------------------------------------------------------------------------------
n_estimators: 5
max_depth: 2
Features: [66, 93, 34, 81, 22]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.405405 0.326087 0.361446 46
1 0.474576 0.560000 0.513761 50
accuracy 0.447917 96
macro avg 0.439991 0.443043 0.437604 96
weighted avg 0.441432 0.447917 0.440777 96
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Gradient Boosting: ROC AUC=0.432
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Candidate n_estimators values per FFS iteration for outer fold 2
params1 = [[5, 2], [3, 25], [30, 35], [80, 90], [100, 150]]
# Candidate max_depth values per FFS iteration
params2 = [[3, 5], [7, 4], [6, 3], [1, 6], [5, 6]]
# Reset the FFS state before the run
featuresOuterFold = []
# Run forward feature selection on outer fold 2
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_2, label_2)
Iteration 1 >> Feature: 49; n_estimators: 150; max_depth: 6; Train Accuracy: 1.0; Test Accuracy: 0.6495; Selected Features: [49] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 71; n_estimators: 90; max_depth: 6; Train Accuracy: 1.0; Test Accuracy: 0.6385; Selected Features: [49, 71] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 67; n_estimators: 35; max_depth: 3; Train Accuracy: 0.75; Test Accuracy: 0.6339; Selected Features: [49, 71, 67] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 47; n_estimators: 25; max_depth: 7; Train Accuracy: 0.9816; Test Accuracy: 0.6329; Selected Features: [49, 71, 67, 47] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 36; n_estimators: 5; max_depth: 5; Train Accuracy: 0.7396; Test Accuracy: 0.6393; Selected Features: [49, 71, 67, 47, 36] ---------------------------------------------------------------------------------------------------------------------------------
# Best hyper-parameter values picked from the FFS run above
param1, param2 = 5, 5  # n_estimators, max_depth
# Score the chosen feature subset / hyper-parameters on the held-out outer fold
evaluationOuterFold(
    clf_name, clf, clf_arguments1, clf_arguments2,
    param1, param2, featuresOuterFold,
    features_2, label_2,
    outerFold_features_2, outerFold_labels_2,
    label_names,
)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 89.0052
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 52.6316
--------------------------------------------------------------------------------------------------
n_estimators: 5
max_depth: 5
Features: [49, 71, 67, 47, 36]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.551724 0.333333 0.415584 48
1 0.515152 0.723404 0.601770 47
accuracy 0.526316 95
macro avg 0.533438 0.528369 0.508677 95
weighted avg 0.533630 0.526316 0.507697 95
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Gradient Boosting: ROC AUC=0.514
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Candidate n_estimators values per FFS iteration for outer fold 3
params1 = [[3, 5], [10, 25], [40, 50], [70, 60], [150, 200]]
# Candidate max_depth values per FFS iteration
params2 = [[2, 4], [3, 4], [3, 7], [4, 8], [5, 6]]
# Reset the FFS state before the run
featuresOuterFold = []
# Run forward feature selection on outer fold 3
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_3, label_3)
Iteration 1 >> Feature: 84; n_estimators: 200; max_depth: 6; Train Accuracy: 0.8429; Test Accuracy: 0.6543; Selected Features: [84] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 24; n_estimators: 70; max_depth: 4; Train Accuracy: 0.9791; Test Accuracy: 0.6335; Selected Features: [84, 24] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 96; n_estimators: 50; max_depth: 3; Train Accuracy: 0.8521; Test Accuracy: 0.6439; Selected Features: [84, 24, 96] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 42; n_estimators: 25; max_depth: 4; Train Accuracy: 0.8665; Test Accuracy: 0.649; Selected Features: [84, 24, 96, 42] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 86; n_estimators: 5; max_depth: 4; Train Accuracy: 0.7645; Test Accuracy: 0.6178; Selected Features: [84, 24, 96, 42, 86] ---------------------------------------------------------------------------------------------------------------------------------
# Best hyper-parameter values picked from the FFS run above
param1, param2 = 25, 4  # n_estimators, max_depth
# Score the chosen feature subset / hyper-parameters on the held-out outer fold
evaluationOuterFold(
    clf_name, clf, clf_arguments1, clf_arguments2,
    param1, param2, featuresOuterFold,
    features_3, label_3,
    outerFold_features_1, outerFold_labels_1,
    label_names,
)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 96.3351
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 48.4211
--------------------------------------------------------------------------------------------------
n_estimators: 25
max_depth: 4
Features: [84, 24, 96, 42, 86]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.450980 0.522727 0.484211 44
1 0.522727 0.450980 0.484211 51
accuracy 0.484211 95
macro avg 0.486854 0.486854 0.484211 95
weighted avg 0.489497 0.484211 0.484211 95
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Gradient Boosting: ROC AUC=0.498
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Plot the combined ROC curves and per-fold summary accumulated in the globals
combinedROCPlot(
    clf_name, featureSubset, param_1, param_2,
    trainScore, testScore, aucScore,
    falsePositiveRate, truePositiveRate,
)
-------------------------------------------- Outer Fold 1 Result -------------------------------------------- Feature Subset: [66, 93, 34, 81, 22] Best n_estimator: 3 Best max_depth: 2 Train Score: 70.52629999999999 Test Score: 44.7917 AUC Score: 0.4297826086956522 -------------------------------------------- Outer Fold 2 Result -------------------------------------------- Feature Subset: [66, 93, 34, 81, 22] Best n_estimator: 5 Best max_depth: 2 Train Score: 70.52629999999999 Test Score: 44.7917 AUC Score: 0.4321739130434783 -------------------------------------------- Outer Fold 3 Result -------------------------------------------- Feature Subset: [49, 71, 67, 47, 36] Best n_estimator: 5 Best max_depth: 5 Train Score: 89.0052 Test Score: 52.6316 AUC Score: 0.5135195035460993
# Translate column indices into column names and pair-plot them against the class label
listB = [39, 19, 84, 26, 40]
res = [selectedFeatures[i] for i in listB]
res.append('BlcaGrade')
sns.pairplot(data[res], hue='BlcaGrade', palette='tab10')
<seaborn.axisgrid.PairGrid at 0x7f5e999151f0>
# ------------------------------------------------------------------
# Reset the module-level accumulators before running the next
# classifier; combinedROCPlot() reads these lists later on.
# ------------------------------------------------------------------
param_1, param_2 = [], []
trainScore, testScore = [], []
falsePositiveRate, truePositiveRate = [], []
aucScore, featureSubset = [], []

# Classifier configuration: k-nearest neighbours
clf_name = 'KNN'                  # display name used in printouts/plots
clf = KNeighborsClassifier        # classifier class (instantiated inside the folds)
clf_arguments1 = 'n_neighbors'    # first hyper-parameter name
clf_arguments2 = 'weights'        # second hyper-parameter name

# Candidate hyper-parameter values, one pair of lists per FFS iteration
params1 = [[2, 3], [3, 4], [4, 5], [5, 6], [6, 7]]
params2 = [['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance']]

# Global list that forward feature selection fills in; starts empty
featuresOuterFold = []

# Run forward feature selection on outer fold 1
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_1, label_1)
Iteration 1 >> Feature: 66; n_neighbors: 7; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.6632; Selected Features: [66] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 81; n_neighbors: 5; weights: uniform; Train Accuracy: 0.725; Test Accuracy: 0.6474; Selected Features: [66, 81] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 96; n_neighbors: 4; weights: distance; Train Accuracy: 0.9789; Test Accuracy: 0.6474; Selected Features: [66, 81, 96] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 22; n_neighbors: 3; weights: uniform; Train Accuracy: 0.7579; Test Accuracy: 0.6526; Selected Features: [66, 81, 96, 22] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 63; n_neighbors: 3; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.6263; Selected Features: [66, 81, 96, 22, 63] ---------------------------------------------------------------------------------------------------------------------------------
# Best hyper-parameter values picked from the FFS run above
param1, param2 = 3, 'uniform'  # n_neighbors, weights
# Score the chosen feature subset / hyper-parameters on the held-out outer fold
evaluationOuterFold(
    clf_name, clf, clf_arguments1, clf_arguments2,
    param1, param2, featuresOuterFold,
    features_1, label_1,
    outerFold_features_3, outerFold_labels_3,
    label_names,
)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 75.7895
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 51.0417
--------------------------------------------------------------------------------------------------
n_neighbors: 3
weights: uniform
Features: [66, 81, 96, 22, 63]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.489362 0.500000 0.494624 46
1 0.530612 0.520000 0.525253 50
accuracy 0.510417 96
macro avg 0.509987 0.510000 0.509938 96
weighted avg 0.510846 0.510417 0.510576 96
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
KNN: ROC AUC=0.492
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Candidate n_neighbors values per FFS iteration for outer fold 2
params1 = [[2, 5], [7, 3], [6, 12], [8, 9], [4, 1]]
# Candidate weighting schemes per FFS iteration
params2 = [['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance']]
# Reset the FFS state before the run
featuresOuterFold = []
# Run forward feature selection on outer fold 2
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_2, label_2)
Iteration 1 >> Feature: 49; n_neighbors: 1; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.6495; Selected Features: [49] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 47; n_neighbors: 8; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.654; Selected Features: [49, 47] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 71; n_neighbors: 6; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.6437; Selected Features: [49, 47, 71] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 23; n_neighbors: 3; weights: uniform; Train Accuracy: 0.7958; Test Accuracy: 0.6698; Selected Features: [49, 47, 71, 23] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 72; n_neighbors: 5; weights: uniform; Train Accuracy: 0.7474; Test Accuracy: 0.639; Selected Features: [49, 47, 71, 23, 72] ---------------------------------------------------------------------------------------------------------------------------------
# Best hyper-parameter values picked from the FFS run above
param1, param2 = 5, 'uniform'  # n_neighbors, weights
# Score the chosen feature subset / hyper-parameters on the held-out outer fold
evaluationOuterFold(
    clf_name, clf, clf_arguments1, clf_arguments2,
    param1, param2, featuresOuterFold,
    features_2, label_2,
    outerFold_features_2, outerFold_labels_2,
    label_names,
)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 71.2042
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 54.736799999999995
--------------------------------------------------------------------------------------------------
n_neighbors: 5
weights: uniform
Features: [49, 47, 71, 23, 72]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.551020 0.562500 0.556701 48
1 0.543478 0.531915 0.537634 47
accuracy 0.547368 95
macro avg 0.547249 0.547207 0.547168 95
weighted avg 0.547289 0.547368 0.547268 95
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
KNN: ROC AUC=0.514
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Candidate n_neighbors values per FFS iteration for outer fold 3
params1 = [[2, 4], [5, 8], [1, 7], [3, 6], [9, 11]]
# Candidate weighting schemes per FFS iteration
params2 = [['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance']]
# Reset the FFS state before the run
featuresOuterFold = []
# Run forward feature selection on outer fold 3
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_3, label_3)
Iteration 1 >> Feature: 51; n_neighbors: 9; weights: uniform; Train Accuracy: 0.6924; Test Accuracy: 0.6339; Selected Features: [51] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 11; n_neighbors: 3; weights: uniform; Train Accuracy: 0.7841; Test Accuracy: 0.6491; Selected Features: [51, 11] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 84; n_neighbors: 1; weights: distance; Train Accuracy: 0.8272; Test Accuracy: 0.6548; Selected Features: [51, 11, 84] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 24; n_neighbors: 8; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.6229; Selected Features: [51, 11, 84, 24] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 28; n_neighbors: 4; weights: uniform; Train Accuracy: 0.7579; Test Accuracy: 0.6285; Selected Features: [51, 11, 84, 24, 28] ---------------------------------------------------------------------------------------------------------------------------------
# Chosen value of hyperparameter 1 (n_neighbors) after inner-fold tuning
param1 = 4
# Chosen value of hyperparameter 2 (weights)
param2 = 'uniform'
# Retrain on the selected feature subset and report train/test accuracy,
# classification report and ROC AUC on the held-out outer fold
evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_3, label_3, outerFold_features_1, outerFold_labels_1, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 70.6806
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 48.4211
--------------------------------------------------------------------------------------------------
n_neighbors: 4
weights: uniform
Features: [51, 11, 84, 24, 28]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.459016 0.636364 0.533333 44
1 0.529412 0.352941 0.423529 51
accuracy 0.484211 95
macro avg 0.494214 0.494652 0.478431 95
weighted avg 0.496808 0.484211 0.474386 95
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
KNN: ROC AUC=0.519
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Summarize the three outer-fold results for this classifier and draw the combined ROC plot
combinedROCPlot(clf_name, featureSubset, param_1, param_2, trainScore, testScore, aucScore, falsePositiveRate, truePositiveRate)
-------------------------------------------- Outer Fold 1 Result -------------------------------------------- Feature Subset: [66, 81, 96, 22, 63] Best n_estimator: 3 Best max_depth: uniform Train Score: 75.7895 Test Score: 51.0417 AUC Score: 0.4919565217391304 -------------------------------------------- Outer Fold 2 Result -------------------------------------------- Feature Subset: [49, 47, 71, 23, 72] Best n_estimator: 5 Best max_depth: uniform Train Score: 71.2042 Test Score: 54.736799999999995 AUC Score: 0.5135195035460993 -------------------------------------------- Outer Fold 3 Result -------------------------------------------- Feature Subset: [49, 47, 71, 23, 72] Best n_estimator: 3 Best max_depth: uniform Train Score: 78.534 Test Score: 48.4211 AUC Score: 0.47340425531914887
# Indices (into selectedFeatures) of the features to visualise pairwise
listB = [66, 33, 84, 55, 99]
# Resolve the indices to their feature/column names
res = [selectedFeatures[idx] for idx in listB]
# Append the class column so the pair plot can colour points by grade
res.append('BlcaGrade')
sns.pairplot(data[res], hue='BlcaGrade', palette='tab10')
<seaborn.axisgrid.PairGrid at 0x7f97951cc940>
# ---------------------------------------------------------------------------
# Logistic Regression: reset the global result accumulators, configure the
# classifier, and run forward feature selection (FFS) on outer fold 1.
# ---------------------------------------------------------------------------
# Per-outer-fold result accumulators (filled by evaluationOuterFold, read
# later by combinedROCPlot). Reset here so results from the previous
# classifier (KNN) do not leak into this one.
param_1 = []
param_2 = []
trainScore = []
testScore = []
falsePositiveRate = []
truePositiveRate = []
aucScore = []
featureSubset = []
# Human-readable classifier name (used in printed reports and plot titles)
clf_name = 'Logistic Regression'
# Classifier class — instantiated inside the helper functions
clf = LogisticRegression
# Names of the two hyperparameters tuned during FFS
clf_arguments1 = 'solver'
clf_arguments2 = 'penalty'
# Candidate values of hyperparameter 1 (solver), one candidate list per fold.
# (Fixed: the original line contained a duplicated "params1 =params1 =" assignment.)
params1 = [['lbfgs', 'liblinear'] for _ in range(5)]
# Candidate values of hyperparameter 2 (penalty), one candidate list per fold
params2 = [['l2'] for _ in range(5)]
# Global accumulator the FFS routine appends selected feature indices into
featuresOuterFold = []
# Run forward feature selection on outer fold 1
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_1, label_1)
Iteration 1 >> Feature: 34; solver: liblinear; penalty: l2; Train Accuracy: 0.6184; Test Accuracy: 0.6105; Selected Features: [34] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 97; solver: liblinear; penalty: l2; Train Accuracy: 0.5908; Test Accuracy: 0.5947; Selected Features: [34, 97] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 26; solver: liblinear; penalty: l2; Train Accuracy: 0.5724; Test Accuracy: 0.5684; Selected Features: [34, 97, 26] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 33; solver: lbfgs; penalty: l2; Train Accuracy: 0.5645; Test Accuracy: 0.5632; Selected Features: [34, 97, 26, 33] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 21; solver: liblinear; penalty: l2; Train Accuracy: 0.5421; Test Accuracy: 0.5632; Selected Features: [34, 97, 26, 33, 21] ---------------------------------------------------------------------------------------------------------------------------------
# Chosen value of hyperparameter 1 (solver) after inner-fold tuning
param1 = 'lbfgs'
# Chosen value of hyperparameter 2 (penalty)
param2 = 'l2'
# Retrain on the selected feature subset and evaluate on held-out outer fold 3 data
evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_1, label_1, outerFold_features_3, outerFold_labels_3, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 62.1053
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 47.9167
--------------------------------------------------------------------------------------------------
solver: lbfgs
penalty: l2
Features: [34, 97, 26, 33, 21]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.458333 0.478261 0.468085 46
1 0.500000 0.480000 0.489796 50
accuracy 0.479167 96
macro avg 0.479167 0.479130 0.478941 96
weighted avg 0.480035 0.479167 0.479393 96
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Logistic Regression: ROC AUC=0.477
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Candidate solvers per outer fold: fold 1 tries liblinear, the remaining folds try sag
params1 = [['lbfgs', 'liblinear']] + [['lbfgs', 'sag'] for _ in range(4)]
# Candidate penalties — l2 only, for every fold
params2 = [['l2'] for _ in range(5)]
# Global accumulator the forward-feature-selection routine appends into
featuresOuterFold = []
# Run forward feature selection on outer fold 2
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_2, label_2)
Iteration 1 >> Feature: 26; solver: lbfgs; penalty: l2; Train Accuracy: 0.5994; Test Accuracy: 0.5858; Selected Features: [26] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 37; solver: lbfgs; penalty: l2; Train Accuracy: 0.572; Test Accuracy: 0.576; Selected Features: [26, 37] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 65; solver: lbfgs; penalty: l2; Train Accuracy: 0.5589; Test Accuracy: 0.5655; Selected Features: [26, 37, 65] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 50; solver: sag; penalty: l2; Train Accuracy: 0.5249; Test Accuracy: 0.5653; Selected Features: [26, 37, 65, 50] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 85; solver: liblinear; penalty: l2; Train Accuracy: 0.5563; Test Accuracy: 0.5652; Selected Features: [26, 37, 65, 50, 85] ---------------------------------------------------------------------------------------------------------------------------------
# Chosen value of hyperparameter 1 (solver) after inner-fold tuning
param1 = 'lbfgs'
# Chosen value of hyperparameter 2 (penalty)
param2 = 'l2'
# Retrain on the selected feature subset and evaluate on held-out outer fold 2 data
evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_2, label_2, outerFold_features_2, outerFold_labels_2, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 64.9215
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 52.6316
--------------------------------------------------------------------------------------------------
solver: lbfgs
penalty: l2
Features: [26, 37, 65, 50, 85]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.542857 0.395833 0.457831 48
1 0.516667 0.659574 0.579439 47
accuracy 0.526316 95
macro avg 0.529762 0.527704 0.518635 95
weighted avg 0.529900 0.526316 0.517995 95
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Logistic Regression: ROC AUC=0.524
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Candidate solvers per outer fold — identical choices for all 5 folds
params1 = [['lbfgs', 'liblinear'] for _ in range(5)]
# Candidate penalties — l2 only, for every fold
params2 = [['l2'] for _ in range(5)]
# Global accumulator the forward-feature-selection routine appends into
featuresOuterFold = []
# Run forward feature selection on outer fold 3
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_3, label_3)
Iteration 1 >> Feature: 47; solver: lbfgs; penalty: l2; Train Accuracy: 0.5641; Test Accuracy: 0.586; Selected Features: [47] --------------------------------------------------------------------------------------------------------------------------------- Iteration 2 >> Feature: 34; solver: liblinear; penalty: l2; Train Accuracy: 0.5877; Test Accuracy: 0.5812; Selected Features: [47, 34] --------------------------------------------------------------------------------------------------------------------------------- Iteration 3 >> Feature: 16; solver: lbfgs; penalty: l2; Train Accuracy: 0.5642; Test Accuracy: 0.5707; Selected Features: [47, 34, 16] --------------------------------------------------------------------------------------------------------------------------------- Iteration 4 >> Feature: 25; solver: liblinear; penalty: l2; Train Accuracy: 0.551; Test Accuracy: 0.5549; Selected Features: [47, 34, 16, 25] --------------------------------------------------------------------------------------------------------------------------------- Iteration 5 >> Feature: 38; solver: liblinear; penalty: l2; Train Accuracy: 0.5523; Test Accuracy: 0.5547; Selected Features: [47, 34, 16, 25, 38] ---------------------------------------------------------------------------------------------------------------------------------
# Chosen value of hyperparameter 1 (solver) after inner-fold tuning
param1 = 'lbfgs'
# Chosen value of hyperparameter 2 (penalty)
param2 = 'l2'
# Retrain on the selected feature subset and evaluate on held-out outer fold 1 data
evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_3, label_3, outerFold_features_1, outerFold_labels_1, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 61.780100000000004
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 60.0
--------------------------------------------------------------------------------------------------
solver: lbfgs
penalty: l2
Features: [47, 34, 16, 25, 38]
--------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.593750 0.431818 0.500000 44
1 0.603175 0.745098 0.666667 51
accuracy 0.600000 95
macro avg 0.598462 0.588458 0.583333 95
weighted avg 0.598810 0.600000 0.589474 95
--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Logistic Regression: ROC AUC=0.603
--------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------- Confusion Matrix --------------------------------------------------------------------------------------------------
# Summarize the three outer-fold results for Logistic Regression and draw the combined ROC plot
# NOTE(review): the helper's printed labels ("Best n_estimator"/"Best max_depth") appear
# hard-coded for a tree model and mislabel solver/penalty here — worth fixing in the helper.
combinedROCPlot(clf_name, featureSubset, param_1, param_2, trainScore, testScore, aucScore, falsePositiveRate, truePositiveRate)
-------------------------------------------- Outer Fold 1 Result -------------------------------------------- Feature Subset: [34, 97, 26, 33, 21] Best n_estimator: lbfgs Best max_depth: l2 Train Score: 62.1053 Test Score: 47.9167 AUC Score: 0.4769565217391304 -------------------------------------------- Outer Fold 2 Result -------------------------------------------- Feature Subset: [26, 37, 65, 50, 85] Best n_estimator: lbfgs Best max_depth: l2 Train Score: 64.9215 Test Score: 52.6316 AUC Score: 0.5243794326241135 -------------------------------------------- Outer Fold 3 Result -------------------------------------------- Feature Subset: [47, 34, 16, 25, 38] Best n_estimator: lbfgs Best max_depth: l2 Train Score: 61.780100000000004 Test Score: 60.0 AUC Score: 0.6033868092691621
# Indices (into selectedFeatures) of the features to visualise pairwise
listB = [78, 68, 48, 59, 61]
# Resolve the indices to their feature/column names
res = [selectedFeatures[idx] for idx in listB]
# Append the class column so the pair plot can colour points by grade
res.append('BlcaGrade')
sns.pairplot(data[res], hue='BlcaGrade', palette='tab10')
<seaborn.axisgrid.PairGrid at 0x7f979513b4f0>